library("quanteda")
library("topicmodels")
library("readtext")
library(readr)
library(tidytext)
library(dplyr)
library(ggplot2)

# Location of the input CSV.
# TODO: set this to your own file; the file must contain only one column
# (the raw text of each document).
data_path <- "path/to/your_file.csv"

# Read the raw texts into a data frame.
d_data <- read_csv(data_path)

# The steps below expect the text to live in a column called X1; fail early
# with a clear message instead of silently building a corpus without text.
if (!"X1" %in% names(d_data)) {
  stop(
    "Expected a column named 'X1' in '", data_path, "' but found: ",
    paste(names(d_data), collapse = ", "),
    call. = FALSE
  )
}

# corpus() needs a variable called "text", so copy the column under that name
# and then drop the original X1 column.
d_data$text <- d_data$X1
d_data$X1 <- NULL

d_corpus <- corpus(d_data) # convert the data frame into a corpus
d_tokens <- tokens(d_corpus) # split the whole corpus into tokens (single words)

# Extend the stop word list to fit this data set: emoji, wildcard patterns
# such as "http*", "@*" and "#*", frequent filler words, and finally the
# standard English stop words.
new_stopwords <- c(
  "👏","👇","🔵", "🌹","✅","🇬🇧","💙", "✔","👉","🚨","❌",
  "🇬🇧","🌳","🗳","👍","🗣","📢","📆","⬇","⤵",
  "minut*", "hour*", "amp", "http*", "@*", "#*", "even", "will",
  "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
  "extra", "use", "rail", "today", "yesterday", "tomorrow", "now", "full",
  "bring", "big", "say", "meet", "across", "add", "step", "s", "t",
  "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
  "summer", "winter", "spring", "autumn", "ever", "pm", "am", "rt", "back",
  "like", "still", "see", "end", "first", "many", "come", "yet", "real", "find",
  "sure", "never",
  "january", "february", "march", "april", "may", "june", "july", "august",
  "september", "october", "november", "december",
  "day", "week", "month", "always", "since", "become", "keep", "let", "can",
  "ahead", "new", "hard", "made", "ago", "just", "still",
  stopwords("english")
)

# Build the document-feature matrix from the corpus, removing the stop words,
# punctuation and numbers, with stemming switched on.
d_dfm <- dfm(
  d_corpus,
  remove = new_stopwords,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  stem = TRUE
)
# Load the MFD dictionary object shipped with the quanteda.dictionaries
# package (the package only needs to be installed, not attached).
data(data_dictionary_MFD, package = "quanteda.dictionaries")
# number of "words" in each MFD dictionary key
lengths(data_dictionary_MFD)
# first 5 values in each dictionary key
lapply(data_dictionary_MFD, head, 5)

# number of words in a text matching the MFD dictionary
# NOTE(review): d_dfm was built with stem = TRUE, so dictionary entries may not
# match the stemmed features -- confirm this is the intended input.
dfm(d_dfm) %>%
  dfm_lookup(dictionary = data_dictionary_MFD) %>%
  tail()



# If you are only interested in the moral foundations themselves, rebuild the
# dictionary as nested categories: foundation first, then valence (virtue/vice).
# Map() names each result after the foundation string it was built from, so the
# top-level keys are care, fairness, loyalty, authority and sanctity.
data_dictionary_MFDnested <- dictionary(
  Map(
    function(foundation) {
      list(
        virtue = data_dictionary_MFD[[paste0(foundation, ".virtue")]],
        vice = data_dictionary_MFD[[paste0(foundation, ".vice")]]
      )
    },
    c("care", "fairness", "loyalty", "authority", "sanctity")
  )
)

# Inspect the nested dictionary to see its details:
lengths(data_dictionary_MFDnested)
lapply(data_dictionary_MFDnested, lengths)

# Now apply it to the texts, first at level 1 only ...
tail(
  dfm_lookup(dfm(d_dfm), dictionary = data_dictionary_MFDnested, levels = 1)
)

# ... and then at level 2 only.
tail(
  dfm_lookup(dfm(d_dfm), dictionary = data_dictionary_MFDnested, levels = 2)
)

# Specifying both levels (or the default of levels = 1:5) matches what we had
# originally with the flattened dictionary. The single-level lookups are shown
# again first for comparison.
tail(
  dfm_lookup(dfm(d_dfm), dictionary = data_dictionary_MFDnested, levels = 1)
)

tail(
  dfm_lookup(dfm(d_dfm), dictionary = data_dictionary_MFDnested, levels = 2)
)

tail(
  dfm_lookup(dfm(d_dfm), dictionary = data_dictionary_MFDnested, levels = 1:2)
)

# Collect the virtue and vice entries of one foundation from the flat MFD
# dictionary, looked up under the keys "<foundation>.virtue" and
# "<foundation>.vice".
mfd_valence_lists <- function(foundation) {
  list(
    virtue = data_dictionary_MFD[[paste0(foundation, ".virtue")]],
    vice = data_dictionary_MFD[[paste0(foundation, ".vice")]]
  )
}

# One virtue/vice list per foundation.
care <- mfd_valence_lists("care")
fairness <- mfd_valence_lists("fairness")
loyalty <- mfd_valence_lists("loyalty")
authority <- mfd_valence_lists("authority")
sanctity <- mfd_valence_lists("sanctity")

# Turn the sanctity virtue/vice lists into a dictionary and look it up in the
# document-feature matrix.
sanctity_dictionary <- dictionary(sanctity)
dfm_results_sanctity <- dfm_lookup(
  d_dfm,
  dictionary = sanctity_dictionary,
  valuetype = "glob",
  case_insensitive = TRUE
)

# Plot the result as a word cloud limited to 40 words.
# max_words is an argument of textplot_wordcloud(), not of dfm_trim(), so it is
# passed to the plotting call. The former stem = TRUE is not a wordcloud option
# and has been dropped; stemming is already requested when d_dfm is built.
# NOTE(review): dfm_lookup() replaces features with dictionary keys, so this
# cloud will presumably contain only the key names (virtue/vice). If the
# matched words themselves should be plotted,
# dfm_select(d_dfm, pattern = sanctity_dictionary) may be the intended input --
# confirm.
textplot_wordcloud(
  dfm_trim(dfm_results_sanctity, min_termfreq = 1),
  max_words = 40,
  comparison = FALSE
)

